// A complete lexer and grammar for CSS 2.1 as defined by the
// W3 specification.
//
// This grammar is free to use providing you retain everyhting in this header comment
// section.
//
// Author      : Jim Idle, Temporal Wave LLC.
// Contact     : jimi@temporal-wave.com
// Website     : http://www.temporal-wave.com
// License     : ANTLR Free BSD License
//
// Please visit our Web site at http://www.temporal-wave.com and try our commercial
// parsers for SQL, C#, VB.Net and more.
//
// This grammar is free to use providing you retain everything in this header comment
// section.
//
grammar css21;

// -------------
// Main rule.	This is the main entry rule for the parser, the top level
//				grammar rule.
//
// A style sheet consists of an optional character set specification, an optional series
// of imports, and then the main body of style rules.
//
styleSheet	
	: 	charSet
		imports*
		bodylist
	 EOF
	;
	
// -----------------
// Character set.	Picks up the user specified character set, should it be present.
//
charSet
	: 	CHARSET_SYM STRING SEMI
	|
	;

// ---------
// Import.	Location of an external style sheet to include in the ruleset.
//
imports
	:	IMPORT_SYM (STRING|URI)	(medium (COMMA medium)*)? SEMI
	;

// ---------
// Media.	Introduce a set of rules that are to be used if the consumer indicates
//			it belongs to the signified medium.
//
media
	: MEDIA_SYM medium (COMMA medium)*
		LBRACE
			ruleSet
		RBRACE
	;

// ---------	
// Medium.	The name of a medim that are particulare set of rules applies to.
//
medium
	: IDENT	
	;
	

bodylist
	: bodyset*
	;
	
bodyset
	: ruleSet
	| media
	| page
	;	
	
page
	: PAGE_SYM pseudoPage?
		LBRACE
			declaration SEMI (declaration SEMI)*
		RBRACE
	;
	
pseudoPage
	: COLON IDENT
	;
	
operator
	: SOLIDUS
	| COMMA
	|
	;
	
combinator
	: PLUS
	| GREATER
	|
	;
	
unaryOperator
	: MINUS
	| PLUS
	;  
	
property
	: IDENT
	;
	
ruleSet
	: selector (COMMA selector)*
		LBRACE
			declaration SEMI (declaration SEMI)*
		RBRACE
	;
	
selector
	: simpleSelector (combinator simpleSelector)*
	;

simpleSelector
	: elementName 
		((esPred)=>elementSubsequent)*
		
	| ((esPred)=>elementSubsequent)+
	;
	
esPred
	: HASH | DOT | LBRACKET | COLON
	;
	
elementSubsequent
	: HASH
	| cssClass
	| attrib
	| pseudo
	;
	
cssClass
	: DOT IDENT
	;
	
elementName
	: IDENT
	| STAR
	;
	
attrib
	: LBRACKET
	
		IDENT
		
			(
				(
					  OPEQ
					| INCLUDES
					| DASHMATCH
				)
				(
					  IDENT
					| STRING
				)		
			)?
	
	  RBRACKET
;

pseudo
	: COLON 
			IDENT
				( // Function
				
					LPAREN IDENT? RPAREN
				)?
	;

declaration
	: property COLON expr prio?
	;
	
prio
	: IMPORTANT_SYM
	;
	
expr
	: term (operator term)*
	;
	
term
	: unaryOperator?
		(
			  NUMBER
			| PERCENTAGE
			| LENGTH
			| EMS
			| EXS
			| ANGLE
			| TIME
			| FREQ
		)
	| STRING
	| IDENT (	// Function
				LPAREN expr RPAREN
			)?
	| URI
	| hexColor
	;
	
hexColor
	: HASH
	;
	
// ==============================================================
// LEXER
//
// The lexer follows the normative section of WWW standard as closely
// as it can. For instance, where the ANTLR lexer returns a token that
// is unambiguous for both ANTLR and lex (the standard defines tokens
// in lex notation), then the token names are equivalent.
//
// Note however that lex has a match order defined as top to bottom 
// with longest match first. This results in a fairly inefficent, match,
// REJECT, match REJECT set of operations. ANTLR lexer grammars are actaully
// LL grammars (and hence LL recognizers), which means that we must
// specifically disambiguate longest matches and so on, when the lex
// like normative grammar results in ambiguities as far as ANTLR is concerned.
//
// This means that some tokens will either be combined compared to the
// normative spec, and the paresr will recognize them for what they are.
// In this case, the token will named as XXX_YYY where XXX and YYY are the
// token names used in the specification.
//
// Lex style macro names used in the spec may sometimes be used (in upper case
// version) as fragment rules in this grammar. However ANTLR fragment rules
// are not quite the same as lex macros, in that they generate actual 
// methods in the recognizer class, and so may not be as effecient. In
// some cases then, the macro contents are embedded. Annotation indicate when
// this is the case.
//
// See comments in the rules for specific details.
// --------------------------------------------------------------
//
// N.B. CSS 2.1 is defined as case insensitive, but because each character
//      is allowed to be written as in escaped form we basically define each
//      character as a fragment and reuse it in all other rules.
// ==============================================================


// --------------------------------------------------------------
// Define all the fragments of the lexer. These rules neither recognize
// nor create tokens, but must be called from non-fragment rules, which
// do create tokens, using these fragments to either purely define the
// token number, or by calling them to match a certain portion of
// the token string.
//

fragment	HEXCHAR		: ('a'..'f'|'A'..'F'|'0'..'9')	;

fragment	NONASCII	: '\u0080'..'\uFFFF'			;	// NB: Upper bound should be \u4177777

fragment	UNICODE		: '\\' HEXCHAR 
								(HEXCHAR 
									(HEXCHAR 
										(HEXCHAR 
											(HEXCHAR HEXCHAR?)?
										)?
									)?
								)? 
								('\r'|'\n'|'\t'|'\f'|' ')*	;
								
fragment	ESCAPE		: UNICODE | '\\' ~('\r'|'\n'|'\f'|HEXCHAR)	;

fragment	NMSTART		: '_'
						| 'a'..'z'
						| 'A'..'Z'
						| NONASCII
						| ESCAPE
						;

fragment	NMCHAR		: '_'
						| 'a'..'z'
						| 'A'..'Z'
						| '0'..'9'
						| '-'
						| NONASCII
						| ESCAPE
						;
						
fragment	NAME		: NMCHAR+	;

fragment	URL			: ( 
							'!'|'#'|'$'|'%'|'&'
                                                        |'*'..'~'
							| NONASCII
							| ESCAPE
						  )*
						;

						
// Basic Alpha characters in upper, lower and escaped form. Note that
// whitespace and newlines are unimportant even within keywords. We do not
// however call a further fragment rule to consume these characters for
// reasons of performance - the rules are still eminently readable.
//
fragment	A	:	('a'|'A') ('\r'|'\n'|'\t'|'\f'|' ')*	
				| 	'\\' ('0' ('0' ('0' '0'?)?)?)? ('4'|'6')'1'
				;
fragment	B	:	('b'|'B') ('\r'|'\n'|'\t'|'\f'|' ')*	
				| 	'\\' ('0' ('0' ('0' '0'?)?)?)? ('4'|'6')'2'
				;
fragment	C	:	('c'|'C') ('\r'|'\n'|'\t'|'\f'|' ')*	
				| 	'\\' ('0' ('0' ('0' '0'?)?)?)? ('4'|'6')'3'
				;
fragment	D	:	('d'|'D') ('\r'|'\n'|'\t'|'\f'|' ')*	
				| 	'\\' ('0' ('0' ('0' '0'?)?)?)? ('4'|'6')'4'
				;
fragment	E	:	('e'|'E') ('\r'|'\n'|'\t'|'\f'|' ')*	
				| 	'\\' ('0' ('0' ('0' '0'?)?)?)? ('4'|'6')'5'
				;
fragment	F	:	('f'|'F') ('\r'|'\n'|'\t'|'\f'|' ')*	
				| 	'\\' ('0' ('0' ('0' '0'?)?)?)? ('4'|'6')'6'
				;
fragment	G	:	('g'|'G') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| 	'\\'
						(
							  'g'
							| 'G'
							| ('0' ('0' ('0' '0'?)?)?)? ('4'|'6')'7'
						)
				;
fragment	H	:	('h'|'H') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(
							  'h'
							| 'H'
							| ('0' ('0' ('0' '0'?)?)?)? ('4'|'6')'8'
						)	
				;
fragment	I	:	('i'|'I') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(
							  'i'
							| 'I'
							| ('0' ('0' ('0' '0'?)?)?)? ('4'|'6')'9'
						)
				;
fragment	J	:	('j'|'J') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(
							  'j'
							| 'J'
							| ('0' ('0' ('0' '0'?)?)?)? ('4'|'6')('A'|'a')
						)	
				;
fragment	K	:	('k'|'K') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(
							  'k'
							| 'K'
							| ('0' ('0' ('0' '0'?)?)?)? ('4'|'6')('B'|'b')
						)	
				;
fragment	L	:	('l'|'L') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(
							  'l'
							| 'L'
							| ('0' ('0' ('0' '0'?)?)?)? ('4'|'6')('C'|'c')
						)	
				;
fragment	M	:	('m'|'M') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(
							  'm'
							| 'M'
							| ('0' ('0' ('0' '0'?)?)?)? ('4'|'6')('D'|'d')
						)	
				;
fragment	N	:	('n'|'N') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(
							  'n'
							| 'N'
							| ('0' ('0' ('0' '0'?)?)?)? ('4'|'6')('E'|'e')
						)	
				;
fragment	O	:	('o'|'O') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(
							  'o'
							| 'O'
							| ('0' ('0' ('0' '0'?)?)?)? ('4'|'6')('F'|'f')
						)	
				;
fragment	P	:	('p'|'P') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\'
						(
							  'p'
							| 'P'
							| ('0' ('0' ('0' '0'?)?)?)? ('5'|'7')('0')
						)	
				;
fragment	Q	:	('q'|'Q') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(
							  'q'
							| 'Q'
							| ('0' ('0' ('0' '0'?)?)?)? ('5'|'7')('1')
						)	
				;
fragment	R	:	('r'|'R') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(
							  'r'
							| 'R'
							| ('0' ('0' ('0' '0'?)?)?)? ('5'|'7')('2')
						)	
				;
fragment	S	:	('s'|'S') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(
							  's'
							| 'S'
							| ('0' ('0' ('0' '0'?)?)?)? ('5'|'7')('3')
						)	
				;
fragment	T	:	('t'|'T') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(
							  't'
							| 'T'
							| ('0' ('0' ('0' '0'?)?)?)? ('5'|'7')('4')
						)	
				;
fragment	U	:	('u'|'U') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(
							  'u'
							| 'U'
							| ('0' ('0' ('0' '0'?)?)?)? ('5'|'7')('5')
						)
				;
fragment	V	:	('v'|'V') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(	  'v'
							| 'V'
							| ('0' ('0' ('0' '0'?)?)?)? ('5'|'7')('6')
						)
				;
fragment	W	:	('w'|'W') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(
							  'w'
							| 'W'
							| ('0' ('0' ('0' '0'?)?)?)? ('5'|'7')('7')
						)	
				;
fragment	X	:	('x'|'X') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(
							  'x'
							| 'X'
							| ('0' ('0' ('0' '0'?)?)?)? ('5'|'7')('8')
						)
				;
fragment	Y	:	('y'|'Y') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(
							  'y'
							| 'Y'
							| ('0' ('0' ('0' '0'?)?)?)? ('5'|'7')('9')
						)
				;
fragment	Z	:	('a'|'Z') ('\r'|'\n'|'\t'|'\f'|' ')* 
				| '\\' 
						(
							  'z'
							| 'Z'
							| ('0' ('0' ('0' '0'?)?)?)? ('5'|'7')('A'|'a')
						)
				;


// -------------
// Comments.	Comments may not be nested, may be multilined and are delimited
//				like C comments: /* ..... */
//				COMMENTS are hidden from the parser which simplifies the parser 
//				grammar a lot.
//
COMMENT			: '/*' ( options { greedy=false; } : .*) '*/'
	
					{
						$channel = 2;	// Comments on channel 2 in case we want to find them
					}
				;

// ---------------------
// HTML comment open.	HTML/XML comments may be placed around style sheets so that they
//						are hidden from higher scope parsing engines such as HTML parsers.
//						They comment open is therfore ignored by the CSS parser and we hide
//						it from the ANLTR parser.
//
CDO				: '<!--'

					{
						$channel = 3;	// CDO on channel 3 in case we want it later
					}
				;
	
// ---------------------			
// HTML comment close.	HTML/XML comments may be placed around style sheets so that they
//						are hidden from higher scope parsing engines such as HTML parsers.
//						They comment close is therfore ignored by the CSS parser and we hide
//						it from the ANLTR parser.
//
CDC				: '-->'

					{
						$channel = 4;	// CDC on channel 4 in case we want it later
					}
				;
				
INCLUDES		: '~='		;
DASHMATCH		: '|='		;

GREATER			: '>'		;
LBRACE			: '{'		;
RBRACE			: '}'		;
LBRACKET		: '['		;
RBRACKET		: ']'		;
OPEQ			: '='		;
SEMI			: ';'		;
COLON			: ':'		;
SOLIDUS			: '/'		;
MINUS			: '-'		;
PLUS			: '+'		;
STAR			: '*'		;
LPAREN			: '('		;
RPAREN			: ')'		;
COMMA			: ','		;
DOT				: '.'		;

// -----------------
// Literal strings.	Delimited by either ' or "
//
fragment	INVALID	:;
STRING			: '\'' ( ~('\n'|'\r'|'\f'|'\'') )* 
					(
						  '\''
						| { $type = INVALID; }
					)
					
				| '"' ( ~('\n'|'\r'|'\f'|'"') )*
					(
						  '"'
						| { $type = INVALID; }
					)
				;

// -------------
// Identifier. 	Identifier tokens pick up properties names and values
//
IDENT			: '-'? NMSTART NMCHAR*	;

// -------------
// Reference.	Reference to an element in the body we are styling, such as <XXXX id="reference">
//
HASH			: '#' NAME				;

IMPORT_SYM		: '@' I M P O R T		;
PAGE_SYM		: '@' P A G E			;
MEDIA_SYM		: '@' M E D I A			;
CHARSET_SYM		: '@charset '			;

IMPORTANT_SYM	: '!' (WS|COMMENT)* I M P O R T A N T	;

// ---------
// Numbers.	Numbers can be followed by pre-known units or unknown units
//			as well as '%' it is a precentage. Whitespace cannot be between
//			the numebr and teh unit or percent. Hence we scan any numeric, then
//			if we detect one of the lexical sequences for unit tokens, we change
//			the lexical type dynamically.
//
//			Here we first define the various tokens, then we implement the
//			number parsing rule.
//
fragment	EMS			:;	// 'em'
fragment	EXS			:;	// 'ex'
fragment	LENGTH		:;	// 'px'. 'cm', 'mm', 'in'. 'pt', 'pc'
fragment	ANGLE		:;	// 'deg', 'rad', 'grad'
fragment	TIME		:;	// 'ms', 's'
fragment	FREQ		:;	// 'khz', 'hz'
fragment	DIMENSION	:;	// nnn'Somethingnotyetinvented'
fragment	PERCENTAGE	:;	// '%'

NUMBER
	:	(
			  '0'..'9' ('.' '0'..'9'+)?
			| '.' '0'..'9'+
		)
		(
			  (E (M|X))=>
			  	E
			  	(
			  		  M 	{ $type = EMS;			}
			  		| X		{ $type = EXS;			}
			  	)
			| (P(X|T|C))=>
				P
				(
					  X		
					| T
					| C
				)
							{ $type = LENGTH;		}	
			| (C M)=>
				C M			{ $type = LENGTH;		}
			| (M (M|S))=> 
				M
				(
					  M 	{ $type = LENGTH;		}
			
					| S 	{ $type = TIME;			}
				)
			| (I N)=>
				I N			{ $type = LENGTH;		}
			
			| (D E G)=>
				D E G		{ $type = ANGLE;		}
			| (R A D)=>
				R A D		{ $type = ANGLE;		}
			
			| (S)=>S		{ $type = TIME;			}
				
			| (K? H	Z)=>
				K? H	Z	{ $type = FREQ;			}
			
			| IDENT			{ $type = DIMENSION;	}
			
			| '%'			{ $type = PERCENTAGE;	}
			
			| // Just a number
		)
	;

// ------------
// url and uri.
//
URI	:	U R L
		'('
			((WS)=>WS)? (URL|STRING) WS?
		')'
	;

// -------------
// Whitespace. 	Though the W3 standard shows a Yacc/Lex style parser and lexer
//              that process the whitespace within the parser, ANTLR does not
//				need to deal with the whitespace directly in the parser.
//
WS		: (' '|'\t')+			{ $channel = HIDDEN;	}	;
NL		: ('\r' '\n'? | '\n')	{ $channel = HIDDEN;	}	;


// -------------
//  Illegal.	Any other character shoudl not be allowed.
//
